# Fix outliers
outlierKD2 <- function(df, var, rm=FALSE) {
#' Original outlierKD functino by By Klodian Dhana,
#' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
#' Modified to have third argument for removing outliers instead of interactive prompt,
#' and after removing outlier, original df will not be changed. The function returns the a df,
#' which can be saved as original df name if desired.
#' Also added QQ-plot in the output.
#' Check outliers, and option to remove them, save as a new dataframe.
#' @param df The dataframe.
#' @param var The variable in the dataframe to be checked for outliers
#' @param rm Boolean. Whether to remove outliers or not.
#' @return The dataframe with outliers replaced by NA if rm==TRUE, or df if nothing changed
#' @examples
#' outlierKD2(mydf, height, FALSE)
#' mydf = outlierKD2(mydf, height, TRUE)
#' mydfnew = outlierKD2(mydf, height, TRUE)
dt = df # duplicate the dataframe for potential alteration
var_name <- eval(substitute(var),eval(dt))
na1 <- sum(is.na(var_name))
m1 <- mean(var_name, na.rm = T)
par(mfrow=c(2, 2), oma=c(0,0,3,0))
boxplot(var_name, main="With outliers")
hist(var_name, main="With outliers", xlab=NA, ylab=NA)
qqnorm(var_name, main = "With outliers")
qqline(var_name)
outlier <- boxplot.stats(var_name)$out
mo <- mean(outlier)
var_name <- ifelse(var_name %in% outlier, NA, var_name)
boxplot(var_name, main="Without outliers")
hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
qqnorm(var_name, main = "Without outliers")
qqline(var_name)
title("Outlier Check", outer=TRUE)
na2 <- sum(is.na(var_name))
cat("Outliers identified:", na2 - na1, "\n")
cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
cat("Mean of the outliers:", round(mo, 2), "\n")
m2 <- mean(var_name, na.rm = T)
cat("Mean without removing outliers:", round(m1, 2), "\n")
cat("Mean if we remove outliers:", round(m2, 2), "\n")
# response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
# if(response == "y" | response == "yes"){
if(rm){
dt[as.character(substitute(var))] <- invisible(var_name)
#assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
cat("Outliers successfully removed", "\n")
return(invisible(dt))
} else {
cat("Nothing changed", "\n")
return(invisible(df))
}
}
loadPkg("xtable")
loadPkg("kableExtra")
loadPkg("stringi")
xkabledply = function(modelsmmrytable, title="Table", digits = 4, pos="left", bso="striped", wide=FALSE) {
#' Combining base::summary, xtable, and kableExtra, to easily display model summary.
#' wrapper for the base::summary function on model objects
#' ELo 202004 GWU DATS
#' version 1.2
#' @param modelsmmrytable This can be a generic table, a model object such as lm(), or the summary of a model object summary(lm())
#' @param title Title of table.
#' @param digits Number of digits to display
#' @param pos Position of table, c("left","center","right")
#' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
#' @return HTML table for display
#' @examples
#' library("xtable")
#' library("kableExtra")
#' xkabledply( df, title="Table testing", pos="left", bso="hover" )
if (wide) { modelsmmrytable <- t(modelsmmrytable) }
modelsmmrytable %>%
xtable() %>%
kable(caption = title, digits = digits) %>%
kable_styling(bootstrap_options = bso, full_width = FALSE, position = pos)
}
xkablesummary = function(df, title="Table: Statistics summary.", digits = 4, pos="left", bso="striped") {
#' Combining base::summary, xtable, and kableExtra, to easily display numeric variable summary of dataframes.
#' ELo 202004 GWU DATS
#' version 1.2
#' @param df The dataframe.
#' @param title Title of table.
#' @param digits Number of digits to display
#' @param pos Position of table, c("left","center","right")
#' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
#' @return The HTML summary table for display, or for knitr to process into other formats
#' @examples
#' xkablesummary( faraway::ozone )
#' xkablesummary( ISLR::Hitters, title="Five number summary", pos="left", bso="hover" )
s = summary(df) %>%
apply( 2, function(x) stringr::str_remove_all(x,c("Min.\\s*:\\s*","1st Qu.\\s*:\\s*","Median\\s*:\\s*","Mean\\s*:\\s*","3rd Qu.\\s*:\\s*","Max.\\s*:\\s*")) ) %>% # replace all leading words
apply( 2, function(x) stringr::str_trim(x, "right")) # trim trailing spaces left
colnames(s) <- stringr::str_trim(colnames(s))
if ( dim(s)[1] ==6 ) { rownames(s) <- c('Min','Q1','Median','Mean','Q3','Max')
} else if ( dim(s)[1] ==7 ) { rownames(s) <- c('Min','Q1','Median','Mean','Q3','Max','NA') }
xkabledply(s, title=title, digits = digits, pos=pos, bso=bso )
}
xkablevif = function(model, title="VIFs of the model", digits = 3, pos="left", bso="striped", wide=FALSE) {
#' Combining faraway::vif, xtable, and kableExtra, to easily display numeric summary of VIFs for a model.
#' ELo 202004 GWU DATS
#' version 1.2
#' @param model The lm or compatible model object.
#' @param title Title of table.
#' @param digits Number of digits to display
#' @param pos Position of table, c("left","center","right")
#' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
#' @param wide print table in long (FALSE) format or wide (TRUE) format
#' @return The HTML summary table of the VIFs for a model for display, or for knitr to process into other formats
#' @examples
#' xkablevif( lm(Salary~Hits+RBI, data=ISLR::Hitters, wide=T ) )
vifs = table( names(model$coefficients)[2:length(model$coefficients)] ) # remove intercept to set column names
vifs[] = faraway::vif(model) # set the values
if (wide) { vifs <- t(vifs) }
xkabledply( vifs, title=title, digits = digits, pos=pos, bso=bso )
}
Wine’s quality grade is affected by mant different factors. To better understand which factor could influence wine’s quality, we sttarted this project. The main objective of our study is to see whether these is an association between The importance of this subject arises from a reccurring topic that we see in the relationship between wine’s quality grade and wine’s description. This topic is brought up often but the question stands whether there is enough evidence to prove that wine’s quality grade and wine’s description is the same.
wine<-data.frame(read.csv("wine.csv"))
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
length(wine$quality)
## [1] 4898
wine$quality=as.factor(wine$quality)
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : Factor w/ 7 levels "3","4","5","6",..: 4 4 4 4 4 4 4 4 4 4 ...
xkablesummary(wine)
| fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | 3.80 | 0.080 | 0.000 | 0.6 | 0.009 | 2.0 | 9 | 0.987 | 2.72 | 0.22 | 8.0 | 3: 20 |
| Q1 | 6.30 | 0.210 | 0.270 | 1.7 | 0.036 | 23.0 | 108 | 0.992 | 3.09 | 0.41 | 9.5 | 4: 163 |
| Median | 6.80 | 0.260 | 0.320 | 5.2 | 0.043 | 34.0 | 134 | 0.994 | 3.18 | 0.47 | 10.4 | 5:1457 |
| Mean | 6.85 | 0.278 | 0.334 | 6.4 | 0.046 | 35.3 | 138 | 0.994 | 3.19 | 0.49 | 10.5 | 6:2198 |
| Q3 | 7.30 | 0.320 | 0.390 | 9.9 | 0.050 | 46.0 | 167 | 0.996 | 3.28 | 0.55 | 11.4 | 7: 880 |
| Max | 14.20 | 1.100 | 1.660 | 65.8 | 0.346 | 289.0 | 440 | 1.039 | 3.82 | 1.08 | 14.2 | 8: 175 |
| NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 9: 5 |
qqnorm(wine$fixed.acidity, main="qq plot of wine's fixed acidity")
qqnorm(wine$volatile.acidity,main="qqplot of wine's volatile.acidity")
qqnorm(wine$citric.acid,main="qqplot of wine's citric.acid")
qqnorm(wine$residual.sugar,main="qqplot of wine's residual.sugar")
qqnorm(wine$chlorides,main="qqplot of wine's chlorides")
qqnorm(wine$free.sulfur.dioxide,main="qqplot of wine's free.sulfur.dioxide")
qqnorm(wine$total.sulfur.dioxide,main="qqplot of wine's total.sulfur.dioxide")
qqnorm(wine$density,main="qqplot of wine's density")
qqnorm(wine$pH,main="qqplot of wine's pH",xlab="pH")
qqnorm(wine$sulphates,main="qqplot of wine's sulphates")
qqnorm(wine$alcohol,main="qqplot of wine's alcohol")
wine<-outlierKD2(wine,fixed.acidity,TRUE)
## Outliers identified: 119
## Propotion (%) of outliers: 2.5
## Mean of the outliers: 8.81
## Mean without removing outliers: 6.85
## Mean if we remove outliers: 6.81
## Outliers successfully removed
wine<-outlierKD2(wine,volatile.acidity,TRUE)
## Outliers identified: 186
## Propotion (%) of outliers: 3.9
## Mean of the outliers: 0.59
## Mean without removing outliers: 0.28
## Mean if we remove outliers: 0.27
## Outliers successfully removed
wine<-outlierKD2(wine,citric.acid,TRUE)
## Outliers identified: 270
## Propotion (%) of outliers: 5.8
## Mean of the outliers: 0.49
## Mean without removing outliers: 0.33
## Mean if we remove outliers: 0.33
## Outliers successfully removed
wine<-outlierKD2(wine,residual.sugar,TRUE)
## Outliers identified: 7
## Propotion (%) of outliers: 0.1
## Mean of the outliers: 32.5
## Mean without removing outliers: 6.39
## Mean if we remove outliers: 6.35
## Outliers successfully removed
wine<-outlierKD2(wine,chlorides,TRUE)
## Outliers identified: 208
## Propotion (%) of outliers: 4.4
## Mean of the outliers: 0.12
## Mean without removing outliers: 0.05
## Mean if we remove outliers: 0.04
## Outliers successfully removed
wine<-outlierKD2(wine,free.sulfur.dioxide,TRUE)
## Outliers identified: 50
## Propotion (%) of outliers: 1
## Mean of the outliers: 101
## Mean without removing outliers: 35.3
## Mean if we remove outliers: 34.6
## Outliers successfully removed
wine<-outlierKD2(wine,total.sulfur.dioxide,TRUE)
## Outliers identified: 19
## Propotion (%) of outliers: 0.4
## Mean of the outliers: 226
## Mean without removing outliers: 138
## Mean if we remove outliers: 138
## Outliers successfully removed
wine<-outlierKD2(wine,density,TRUE)
## Outliers identified: 5
## Propotion (%) of outliers: 0.1
## Mean of the outliers: 1.01
## Mean without removing outliers: 0.99
## Mean if we remove outliers: 0.99
## Outliers successfully removed
wine<-outlierKD2(wine,pH,TRUE)
## Outliers identified: 75
## Propotion (%) of outliers: 1.6
## Mean of the outliers: 3.55
## Mean without removing outliers: 3.19
## Mean if we remove outliers: 3.18
## Outliers successfully removed
wine<-outlierKD2(wine,sulphates,TRUE)
## Outliers identified: 124
## Propotion (%) of outliers: 2.6
## Mean of the outliers: 0.84
## Mean without removing outliers: 0.49
## Mean if we remove outliers: 0.48
## Outliers successfully removed
wine<-outlierKD2(wine,alcohol,TRUE)
## Outliers identified: 0
## Propotion (%) of outliers: 0
## Mean of the outliers: NaN
## Mean without removing outliers: 10.5
## Mean if we remove outliers: 10.5
## Outliers successfully removed